library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggExtra) ; library(ggpubr)
library(biomaRt) ; library(DESeq2) ; library(sva) ; library(vsn) ; library(WGCNA) ;
library(GEOquery) ; library(lumi) ; library(limma)
library(dendextend) ; library(expss)
library(knitr) ; library(kableExtra)
Dataset downloaded from Arkinglab website in the Transcriptome analysis reveals dysregulation of innate immune response genes and neuronal activity-dependent genes in autism section.
# NCBI biotype annotation
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>%
dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene,
'hgnc_symbol'=Symbol) %>%
mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))
###############################################################################################################
# DOWNLOAD AND CLEAN GENE EXPRESSION DATA
# Gene expression data downloaded directly from GEO because the one returned with get GEO was already preprocessed
datExpr = lumiR('./../Data/GSE28521_non-normalized_data.txt.gz') %>% as.matrix
## Perform Quality Control assessment of the LumiBatch object ...
rownames_datExpr = rownames(datExpr)
datExpr = datExpr %>% data.frame %>% mutate_all(function(x) x %>% as.numeric)
rownames(datExpr) = rownames_datExpr
# Download Metadata
GEO_data = getGEO('GSE28521', destdir='./../Data/')[[1]]
#datExpr = exprs(GEO_data) %>% data.frame # Already filtered and normalised data
#datGenes = fData(GEO_data) # Only includes information for the filtered genes
datMeta = pData(GEO_data) %>% mutate('ID' = geo_accession, 'Brain_lobe' = `tissue (brain region):ch1`,
'Diagnosis' = factor(ifelse(`disease status:ch1`=='autism', 'ASD','CTL'),
levels = c('CTL','ASD')),
'Subject_ID' = substring(title,1,9)) %>%
dplyr::select(ID, title, Subject_ID, description, Diagnosis, Brain_lobe)
rownames(datMeta) = paste0('X',datMeta$description)
# LABEL GENES WITH ENSEMBL IDS (THEY COME WITH ILLUMINA IDS)
# Get Biomart information
ensembl = useMart('ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
getinfo = c('illumina_humanref_8_v3', 'ensembl_gene_id','external_gene_id', 'entrezgene', 'hgnc_symbol',
'chromosome_name', 'start_position', 'end_position')
datGenes = getBM(attributes = getinfo, filters='illumina_humanref_8_v3', values = rownames(datExpr),
mart = ensembl)
datGenes = datGenes %>% left_join(NCBI_biotype %>% dplyr::select(-hgnc_symbol), by = 'ensembl_gene_id') %>%
add_count(illumina_humanref_8_v3) %>% filter(n == 1 | gene_biotype == 'protein_coding') %>%
distinct(illumina_humanref_8_v3, .keep_all = TRUE) %>%
mutate(length = end_position - start_position)
# Match DatExpr and datGenes rows, and datExpr columns and datMeta rows
datExpr = datExpr[rownames(datExpr) %in% datGenes$illumina_humanref_8_v3,]
datGenes = datGenes[match(rownames(datExpr), datGenes$illumina_humanref_8_v3),]
datMeta = datMeta[match(colnames(datExpr),rownames(datMeta)),]
###############################################################################################################
# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_01-03-2020_w_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# NCBI biotype annotation
NCBI_biotype = read.csv('./../../../NCBI/Data/gene_biotype_info.csv') %>%
dplyr::rename('ensembl_gene_id'=Ensembl_gene_identifier, 'gene_biotype'=type_of_gene,
'hgnc_symbol'=Symbol) %>%
mutate(gene_biotype = ifelse(gene_biotype=='protein-coding','protein_coding', gene_biotype))
gg_colour_hue = function(n) {
hues = seq(15, 375, length = n+1)
pal = hcl(h = hues, l = 65, c = 100)[1:n]
}
rm(GEO_data, GO_annotations, ensembl, getinfo, mart, rownames_datExpr)
RNA-Seq for 79 cortical brain-tissue samples across frontal, temporal lobes and cerebellum, comprising 29 samples from control subjects and 29 samples from ASD subjects
The dataset includes 23527 genes from 79 samples belonging to 33 different subjects.
Counts distribution: Heavy right tail
counts = datExpr %>% melt %>% mutate(value = value %>% as.numeric)
count_distr = data.frame('Statistic' = c('Min', '1st Quartile', 'Median', 'Mean', '3rd Quartile', 'Max'),
'Values' = c(min(counts$value), quantile(counts$value, probs = c(.25, .5)) %>% unname,
mean(counts$value), quantile(counts$value, probs = c(.75)) %>% unname,
max(counts$value)))
count_distr %>% kable(digits = 2, format.args = list(scientific = FALSE)) %>% kable_styling(full_width = F)
| Statistic | Values |
|---|---|
| Min | 78.38 |
| 1st Quartile | 245.22 |
| Median | 375.93 |
| Mean | 1487.09 |
| 3rd Quartile | 1051.63 |
| Max | 54121.42 |
rm(counts, count_distr)
Diagnosis distribution by Sample: Balanced
table_info = datMeta %>% apply_labels(Diagnosis = 'Diagnosis', Brain_lobe = 'Brain Lobe')
cro(table_info$Diagnosis)
| #Total | |
|---|---|
| Diagnosis | |
| CTL | 40 |
| ASD | 39 |
| #Total cases | 79 |
Diagnosis distribution by Subject: Balanced
cro(table_info$Diagnosis[!duplicated(table_info$Subject_ID)])
| #Total | |
|---|---|
| Diagnosis | |
| CTL | 17 |
| ASD | 16 |
| #Total cases | 33 |
Brain region distribution: The Frontal lobe has more samples, but they are quite balanced
cro(table_info$Brain_lobe)
| #Total | |
|---|---|
| Brain Lobe | |
| Cerebellum | 21 |
| Frontal cortex | 32 |
| Temporal cortex | 26 |
| #Total cases | 79 |
Balanced
cro(table_info$Diagnosis, list(table_info$Brain_lobe,total()))
| Brain Lobe | #Total | ||||
|---|---|---|---|---|---|
| Cerebellum | Frontal cortex | Temporal cortex | |||
| Diagnosis | |||||
| CTL | 11 | 16 | 13 | 40 | |
| ASD | 10 | 16 | 13 | 39 | |
| #Total cases | 21 | 32 | 26 | 79 | |
Note: No age or gender information in the metadata :/
I was originally running this with the feb2014 version of BioMart because that’s the one that Gandal used (and it finds all of the Ensembl IDs, which other versions don’t), but it has some outdated biotype annotations, to fix this I’ll obtain all the information except the biotype label from BioMart in the same way as it had been done before, and then I’ll add the most current biotype label using information from NCBI’s website and then from BioMart in the following way:
Use BioMart to run a query with the original feb2014 version using the Ensembl IDs as keys to obtain all the information except the biotype labels (we are not going to retrieve the gene name from biomart because we already extracted it from datExpr)
Annotate genes with Biotype labels:
2.1 Use the NCBI annotations downloaded from NCBI’s website and processed in NCBI/RMarkdowns/clean_data.html (there is information for only 26K genes, so some genes will remain unlabelled)
2.2 Use the current version (jan2020) to obtain the biotype annotations using the Ensembl ID as keys (some genes don’t return a match)
2.3 For the genes that didn’t return a match, use the current version (jan2020) to obtain the biotype annotations using the gene name as keys
2.4 For the genes that returned multiple labels, use the feb2014 version with the Ensembl IDs as keys
Note: A small proportion of genes don’t make a match in any of these queries, so they will be lost when we start filtering out genes
labels_source = data.frame('source' = c('NCBI', 'BioMart2020_byID', 'BioMart2020_byGene', 'BioMart2014'),
'n_matches' = rep(0,4))
########################################################################################
# 1. Query archive version
# We already did this part when loading the data
cat(paste0('1. ', sum(is.na(datGenes$start_position)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the feb2014 version of BioMart'))
## 1. 0/23527 Ensembl IDs weren't found in the feb2014 version of BioMart
########################################################################################
########################################################################################
# 2. Get Biotype Labels
cat('2. Add biotype information')
## 2. Add biotype information
########################################################################################
# 2.1 Add NCBI annotations
# Already did this as well
cat(paste0('2.1 ' , sum(is.na(datGenes$gene_biotype)), '/', nrow(datGenes),
' Ensembl IDs weren\'t found in the NCBI database'))
## 2.1 414/23527 Ensembl IDs weren't found in the NCBI database
labels_source$n_matches[1] = sum(!is.na(datGenes$gene_biotype))
########################################################################################
# 2.2 Query current BioMart version for gene_biotype using Ensembl ID as key
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'jan2020.archive.ensembl.org')
datGenes_biotype = getBM(attributes = getinfo, filters = c('ensembl_gene_id'), mart=mart,
values = datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
cat(paste0('2.2 ' , sum(is.na(datGenes$gene_biotype))-nrow(datGenes_biotype), '/',
sum(is.na(datGenes$gene_biotype)),
' Ensembl IDs weren\'t found in the jan2020 version of BioMart when querying by Ensembl ID'))
## 2.2 217/414 Ensembl IDs weren't found in the jan2020 version of BioMart when querying by Ensembl ID
# Add new gene_biotype info to datGenes
datGenes = datGenes %>% left_join(datGenes_biotype, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(as.character(gene_biotype.x), gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[2] = sum(!is.na(datGenes$gene_biotype)) - labels_source$n_matches[1]
########################################################################################
# 3. Query current BioMart version for gene_biotype using gene symbol as key
missing_genes = unique(datGenes$external_gene_id[is.na(datGenes$gene_biotype)])
getinfo = c('hgnc_symbol','gene_biotype')
datGenes_biotype_by_gene = getBM(attributes = getinfo, filters = c('hgnc_symbol'), mart = mart,
values = missing_genes)
cat(paste0('2.3 ', length(missing_genes)-length(unique(datGenes_biotype_by_gene$hgnc_symbol)),'/',
length(missing_genes),
' genes weren\'t found in the current BioMart version when querying by gene name'))
## 2.3 40/169 genes weren't found in the current BioMart version when querying by gene name
dups = unique(datGenes_biotype_by_gene$hgnc_symbol[duplicated(datGenes_biotype_by_gene$hgnc_symbol)])
cat(paste0(' ', length(dups), ' genes returned multiple labels (these won\'t be added)'))
## 2 genes returned multiple labels (these won't be added)
# Update information
datGenes_biotype_by_gene = datGenes_biotype_by_gene %>% filter(!hgnc_symbol %in% dups)
datGenes = datGenes %>% left_join(datGenes_biotype_by_gene, by=c('external_gene_id'='hgnc_symbol')) %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y) %>%
mutate(hgnc_symbol = ifelse(is.na(hgnc_symbol), external_gene_id, hgnc_symbol))
labels_source$n_matches[3] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# 4. Query feb2014 BioMart version for the missing biotypes
missing_ensembl_ids = unique(datGenes$ensembl_gene_id[is.na(datGenes$gene_biotype)])
getinfo = c('ensembl_gene_id','gene_biotype')
mart = useMart(biomart = 'ENSEMBL_MART_ENSEMBL', dataset = 'hsapiens_gene_ensembl',
host = 'feb2014.archive.ensembl.org')
datGenes_biotype_archive = getBM(attributes = getinfo, filters=c('ensembl_gene_id'),
values = missing_ensembl_ids, mart=mart)
cat(paste0('2.4 ', length(missing_ensembl_ids)-nrow(datGenes_biotype_archive),'/',length(missing_ensembl_ids),
' genes weren\'t found in the feb2014 BioMart version when querying by Ensembl ID'))
## 2.4 0/42 genes weren't found in the feb2014 BioMart version when querying by Ensembl ID
# Update information
datGenes = datGenes %>% left_join(datGenes_biotype_archive, by='ensembl_gene_id') %>%
mutate(gene_biotype = coalesce(gene_biotype.x, gene_biotype.y)) %>%
dplyr::select(-gene_biotype.x, -gene_biotype.y)
labels_source$n_matches[4] = sum(!is.na(datGenes$gene_biotype)) - sum(labels_source$n_matches)
########################################################################################
# Plot results
labels_source = labels_source %>% add_row(source = 'missing',
n_matches = nrow(datGenes) - sum(labels_source$n_matches)) %>%
mutate(x = 1, percentage = round(100*n_matches/sum(n_matches),2),
source = factor(source, levels=c('BioMart2014','BioMart2020_byGene','BioMart2020_byID',
'NCBI','missing')))
p = labels_source %>% ggplot(aes(x, percentage, fill=source)) + geom_bar(position='stack', stat='identity') +
theme_minimal() + coord_flip() + theme(legend.position='bottom', axis.title.y=element_blank(),
axis.text.y=element_blank(), axis.ticks.y=element_blank()) + ylab('Percentage of genes') +
scale_fill_manual(values = c(gg_colour_hue(nrow(labels_source)-1),'gray'))
ggplotly(p + theme(legend.position='none'))
as_ggplot(get_legend(p))
########################################################################################
# Reorder rows to match datExpr
datGenes = datGenes[match(rownames(datExpr), datGenes$illumina_humanref_8_v3),]
rm(getinfo, mart, datGenes_BM, datGenes_biotype, datGenes_biotype_by_gene, datGenes_biotype_archive,
dups, missing_ensembl_ids, missing_genes, labels_source, p)
Checking how many SFARI genes are in the dataset
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
n_SFARI = df[['gene-symbol']] %>% unique %>% length
Considering all genes, this dataset contains 860 of the 912 SFARI genes
1.- Filter entries for which we didn’t manage to obtain its genotype information
1.1 Missing Biotype
to_keep = !is.na(datGenes$gene_biotype)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
Removed 0 ‘genes’, 23527 remaining
Filtering genes without biotype information, we are left with 860 SFARI Genes (we lose 0 genes)
2.- Filter genes that do not encode any protein
98% of the genes are protein coding genes
datGenes$gene_biotype %>% table %>% sort(decreasing=TRUE) %>% kable(caption='Biotypes of genes in dataset') %>%
kable_styling(full_width = F)
| . | Freq |
|---|---|
| protein_coding | 23049 |
| 1 | 157 |
| 3 | 119 |
| processed_pseudogene | 66 |
| lncRNA | 60 |
| transcribed_unprocessed_pseudogene | 28 |
| unprocessed_pseudogene | 20 |
| transcribed_processed_pseudogene | 14 |
| lincRNA | 4 |
| transcribed_unitary_pseudogene | 4 |
| pseudogene | 3 |
| TEC | 2 |
| polymorphic_pseudogene | 1 |
Non-protein coding genes in general have lower levels of expression than protein coding genes, but the difference is not that big
plot_data = data.frame('ID' = rownames(datExpr), 'MeanExpr' = rowMeans(datExpr),
'ProteinCoding' = datGenes$gene_biotype=='protein_coding')
ggplotly(plot_data %>% ggplot(aes(log2(MeanExpr+1), fill=ProteinCoding, color=ProteinCoding)) +
geom_density(alpha=0.5) + theme_minimal())
rm(plot_data)
df = SFARI_genes %>% dplyr::select(-gene_biotype) %>% inner_join(datGenes, by=c('ID'='ensembl_gene_id'))
Filtering protein coding genes, we are left with 860 SFARI Genes (we lose 0 genes)
if(!all(rownames(datExpr)==rownames(datGenes))) cat('!!! gene rownames do not match!!!')
## !!! gene rownames do not match!!!
to_keep = datGenes$gene_biotype == 'protein_coding'
datExpr = datExpr %>% filter(to_keep)
datGenes = datGenes %>% filter(to_keep)
Removed 478 genes. 23049 remaining
3.- Filter genes with low expression levels
Choosing the threshold:
Criteria for selecting filtering threshold: The minimum value in which the preprocessed data is relatively homoscedastic (we’re trying to get rid of the group of genes with very low mean and SD that make the cloud of points look like a comic book speech bubble)
datMeta_original = datMeta
datExpr_original = datExpr
datGenes_original = datGenes
thresholds = c(100, 200, 220, 250, 270, 300)
for(threshold in thresholds){
datMeta = datMeta_original
datExpr = datExpr_original
datGenes = datGenes_original
to_keep = apply(datExpr, 1, function(x) mean(x) >= threshold)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
# Filter outlier samples
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
# Normaise data using variance stabilisation normalisation
LumiBatch = ExpressionSet(assayData = datExpr %>% as.matrix)
pData(LumiBatch) = datMeta
LumiBatch = lumiN(LumiBatch, method = 'vsn', verbose = FALSE)
datExpr = exprs(LumiBatch)
rm(absadj, netsummary, ku, z.ku, to_keep, LumiBatch)
# Save summary results in dataframe
if(threshold == thresholds[1]){
mean_vs_sd_data = data.frame('threshold' = threshold, 'ID' = rownames(datExpr),
'Mean' = rowMeans(datExpr), 'SD' = apply(datExpr,1,sd))
} else {
new_entries = data.frame('threshold' = threshold, 'ID' = rownames(datExpr),
'Mean' = rowMeans(datExpr), 'SD' = apply(datExpr,1,sd))
mean_vs_sd_data = rbind(mean_vs_sd_data, new_entries)
}
}
# Visualise the effects of different thresholds
to_keep_1 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean<7] %>%
as.character
to_keep_2 = mean_vs_sd_data$ID[mean_vs_sd_data$threshold==thresholds[1] & mean_vs_sd_data$Mean>=7]
to_keep_2 = to_keep_2 %>% sample(round(length(to_keep_2)/10)) %>% as.character
plot_data = mean_vs_sd_data[mean_vs_sd_data$ID %in% c(to_keep_1, to_keep_2),]
ggplotly(plot_data %>% ggplot(aes(Mean, SD)) +
geom_point(color='#0099cc', alpha=0.2, aes(id=ID, frame=threshold)) +
scale_x_log10() + scale_y_log10() + theme_minimal())
# Plot remaining genes
plot_data = mean_vs_sd_data %>% group_by(threshold) %>% tally
ggplotly(plot_data %>% ggplot(aes(threshold, n)) + geom_point() + geom_line() +
theme_minimal() + ggtitle('Remaining genes for each filtering threshold'))
rm(to_keep_1, to_keep_2, plot_data, dds, thresholds)
# Return to original variables
datExpr = datExpr_original
datGenes = datGenes_original
datMeta = datMeta_original
rm(datExpr_original, datGenes_original, datMeta_original)
Selecting a threshold of 220
# Minimum percentage of non-zero entries allowed per gene
threshold = 220
plot_data = data.frame('id'=rownames(datExpr),
'threshold' = apply(datExpr, 1, function(x) mean(x)))
ggplotly(plot_data %>% ggplot(aes(x=threshold)) +
geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
geom_vline(xintercept=threshold, color='gray') +
xlab('Mean level of expression') + ylab('Density') + scale_x_log10() +
ggtitle('Mean level of expression by Gene') + theme_minimal())
to_keep = apply(datExpr, 1, function(x) mean(x) >= threshold)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
Removed 2652 genes. 20397 remaining
825 SFARI genes remaining (we lost 35 genes)
n_SFARI = SFARI_genes[['gene-symbol']][SFARI_genes$ID %in%datGenes$ensembl_gene_id] %>% unique %>% length
rm(threshold, plot_data, to_keep)
4.- Filter outlier samples
Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)
Gandal uses the formula \(s_{ij}=\frac{1+bw(i,j)}{2}\) to convert all the weights to positive values, but I used \(s_{ij}=|bw(i,j)|\) instead because I think it makes more sense. In the end it doesn’t matter because they select as outliers the same six samples
5 outliers
absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$ID,
'Subject_ID'=datMeta$Subject_ID, 'Brain_Lobe'=datMeta$Brain_lobe,
'Diagnosis'=datMeta$Diagnosis)
selectable_scatter_plot(plot_data, plot_data[,-c(1:3)])
Outlier samples: GSM706410, GSM706394, GSM706425, GSM706391, GSM706455
to_keep = z.ku > -2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
rm(absadj, netsummary, ku, z.ku, plot_data)
Removed 5 samples, 74 remaining
rm(to_keep)
5.- Filter repeated genes
There are 4939 genes with more than one ensembl ID in the dataset. To accurately refer to the rows of my data as ‘genes’, I’m going to remove the repeated ones.
dup_genes = datGenes$hgnc_symbol %>% duplicated
datGenes = datGenes[!dup_genes,]
datExpr = datExpr[!dup_genes,]
Removed 4939 genes. 15458 remaining
825 SFARI genes remaining (we lost 0 genes)
rm(dup_genes, n_SFARI)
After filtering, the dataset consists of 15458 genes and 74 samples
save(datExpr, datMeta, datGenes, file='./../Data/filtered_raw_data.RData')
#load('./../Data/filtered_raw_data.RData')
There are no batch surrogate variables in this dataset
Cerebellum samples seem to cluster together. There doesn’t seem to be a strong pattern related to Diagnosis
h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram
dend_meta = datMeta[match(labels(h_clusts), rownames(datMeta)),] %>%
mutate('Diagnosis' = ifelse(Diagnosis=='CTL','#008080','#86b300'), # Blue control, Green ASD
'Region' = case_when(Brain_lobe=='Frontal cortex'~'#F8766D', # ggplot defaults for 3 colours
Brain_lobe=='Temporal cortex'~'#00BA38',
TRUE~'#619CFF')) %>%
dplyr::select(Region, Diagnosis)
h_clusts %>% dendextend::set('labels', rep('', nrow(datMeta))) %>%
dendextend::set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)
rm(h_clusts, dend_meta)
Following the pipeline from Surrogate variable analysis: hidden batch effects where sva is used with DESeq2:
Create a lumi object
# Normaise data using variance stabilisation normalisation
LumiBatch = ExpressionSet(assayData = datExpr %>% as.matrix)
pData(LumiBatch) = datMeta
LumiBatch = lumiN(LumiBatch, method = 'vsn', verbose = FALSE)
datExpr_norm = exprs(LumiBatch)
Provide the normalized counts and two model matrices to SVA. The first matrix uses the biological condition, and the second model matrix is the null model.
# Perform vst
mod = model.matrix(~ Diagnosis, datMeta)
mod0 = model.matrix(~ 1, datMeta)
sva_fit = sva(datExpr_norm, mod=mod, mod0=mod0)
## Number of significant surrogate variables is: 7
## Iteration (out of 5 ):1 2 3 4 5
rm(mod, mod0)
Found 7 surrogate variables, since there is no direct way to select which ones to pick Bioconductor answer, kept all of them.
Include SV estimations to datMeta information
sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV', 1:ncol(sv_data))
datMeta_sva = cbind(datMeta, sv_data)
rm(sv_data, sva_fit)
Using the lumi package to perform normalisation
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='gray') +
scale_x_log10() + scale_y_log10() + theme_minimal()
rm(plot_data)
Using vsn to normalise the data
# Normaise data using variance stabilisation normalisation
LumiBatch = ExpressionSet(assayData = datExpr %>% as.matrix)
pData(LumiBatch) = datMeta
LumiBatch = lumiN(LumiBatch, method = 'vsn', verbose = FALSE)
datExpr_vst = exprs(LumiBatch)
# DEA
fit = lmFit(datExpr_vst, design=model.matrix(~Diagnosis, data = datMeta))
efit = eBayes(fit)
DE_info = topTable(efit,coef=2, number=Inf, sort.by='none')
rm(LumiBatch, fit)
Using the plotting function DESEq2’s manual proposes to study vst’s output it looks like the data could be homoscedastic
meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()
Plotting points individually we can notice some heteroscedasticity in the data in the genes with the lowest levels of expression
plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))
plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.2) + geom_smooth(color = 'gray') +
scale_x_log10() + scale_y_log10() + theme_minimal()
rm(plot_data)
Rename normalised datasets to continue working with these
datExpr = datExpr_vst
datMeta = datMeta_sva %>% data.frame
#datGenes = datGenes_vst
rm(datExpr_vst, datMeta_vst, datMeta_sva)
## Warning in rm(datExpr_vst, datMeta_vst, datMeta_sva): object 'datMeta_vst' not
## found
In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data. The only thing everyone agrees on is that you shouldn’t remove them before performing DEA but instead include them in the model.
Based on the conclusions from Practical impacts of genomic data “cleaning” on biological discovery using surrogate variable analysis it seems like it may be a good idea to remove the batch effects from the data and not only from the DE analysis:
Using SVA, ComBat or related tools can increase the power to identify specific signals in complex genomic datasets (they found “greatly sharpened global and gene-specific differential expression across treatment groups”)
But caution should be exercised to avoid removing biological signal of interest
We must be precise and deliberate in the design and analysis of experiments and the resulting data, and also mindful of the limitations we impose with our own perspective
Open data exploration is not possible after such supervised “cleaning”, because effects beyond those stipulated by the researcher may have been removed
# Taken from https://www.biostars.org/p/121489/#121500
correctDatExpr = function(datExpr, mod, svs) {
X = cbind(mod, svs)
Hat = solve(t(X) %*% X) %*% t(X)
beta = (Hat %*% t(datExpr))
rm(Hat)
gc()
P = ncol(mod)
return(datExpr - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}
pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp
# Correct
mod = model.matrix(~ Diagnosis, datMeta)
svs = datMeta %>% dplyr::select(contains('SV')) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)
pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp
rm(correctDatExpr)
The data is divided into two very different groups but the SVA manages to join them together and make Diagnosis the primary characteristic that separetes them
pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
'PC2'=pca_samples_before$x[,2], 'corrected'=0),
data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
left_join(datMeta %>% mutate('ID'=rownames(datMeta)), by='ID')
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=corrected, id=ID), alpha=0.75) +
xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
round(100*summary(pca_samples_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
round(100*summary(pca_samples_after)$importance[2,2],1))) +
ggtitle('Samples') + theme_minimal())
The group of samples that has a very different behaviour to the rest of the genes at the beginning are the ones from the Cerebellum. After performing SVA, they are no longer recognisable
ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=corrected, id=ID), alpha=0.75) +
xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
round(100*summary(pca_samples_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
round(100*summary(pca_samples_after)$importance[2,2],1))) +
ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)
It seems like the sva correction preserves the mean expression of the genes and erases almost everything else (although what little else remains is enough to characterise the two Diagnosis groups pretty well using only the first PC)
*Plot is done with only 10% of the genes so it’s not that heavy
pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_after$x[,1],
'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))
keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))
pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)
ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) +
geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
'). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
round(100*summary(pca_genes_after)$importance[2,1],1))) +
ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
'). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
round(100*summary(pca_genes_after)$importance[2,2],1))) +
scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_genes_df, keep_genes)
Everything looks good, so we’re keeping the corrected expression dataset
datExpr = datExpr_corrected
rm(datExpr_corrected)
save(datExpr, datMeta, datGenes, DE_info, efit, file='./../Data/preprocessed_data.RData')
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] kableExtra_1.1.0 knitr_1.28
## [3] expss_0.10.2 dendextend_1.13.4
## [5] limma_3.40.6 lumi_2.36.0
## [7] GEOquery_2.52.0 WGCNA_1.69
## [9] fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [11] vsn_3.52.0 sva_3.32.1
## [13] genefilter_1.66.0 mgcv_1.8-31
## [15] nlme_3.1-147 DESeq2_1.24.0
## [17] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [19] BiocParallel_1.18.1 matrixStats_0.56.0
## [21] Biobase_2.44.0 GenomicRanges_1.36.1
## [23] GenomeInfoDb_1.20.0 IRanges_2.18.3
## [25] S4Vectors_0.22.1 BiocGenerics_0.30.0
## [27] biomaRt_2.40.5 ggpubr_0.2.5
## [29] magrittr_1.5 ggExtra_0.9
## [31] GGally_1.5.0 gridExtra_2.3
## [33] viridis_0.5.1 viridisLite_0.3.0
## [35] RColorBrewer_1.1-2 plotlyutils_0.0.0.9000
## [37] plotly_4.9.2 glue_1.4.1
## [39] reshape2_1.4.4 forcats_0.5.0
## [41] stringr_1.4.0 dplyr_1.0.0
## [43] purrr_0.3.4 readr_1.3.1
## [45] tidyr_1.1.0 tibble_3.0.1
## [47] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.0 RSQLite_2.2.0 AnnotationDbi_1.46.1
## [4] htmlwidgets_1.5.1 grid_3.6.3 munsell_0.5.0
## [7] codetools_0.2-16 preprocessCore_1.46.0 nleqslv_3.3.2
## [10] miniUI_0.1.1.1 withr_2.2.0 colorspace_1.4-1
## [13] highr_0.8 rstudioapi_0.11 ggsignif_0.6.0
## [16] labeling_0.3 GenomeInfoDbData_1.2.1 farver_2.0.3
## [19] bit64_0.9-7 rhdf5_2.28.1 vctrs_0.3.1
## [22] generics_0.0.2 xfun_0.12 R6_2.4.1
## [25] doParallel_1.0.15 illuminaio_0.26.0 locfit_1.5-9.4
## [28] bitops_1.0-6 reshape_0.8.8 assertthat_0.2.1
## [31] promises_1.1.0 scales_1.1.1 nnet_7.3-14
## [34] gtable_0.3.0 affy_1.62.0 methylumi_2.30.0
## [37] rlang_0.4.6 splines_3.6.3 rtracklayer_1.44.4
## [40] lazyeval_0.2.2 acepack_1.4.1 impute_1.58.0
## [43] hexbin_1.28.1 broom_0.5.5 checkmate_2.0.0
## [46] BiocManager_1.30.10 yaml_2.2.1 modelr_0.1.6
## [49] crosstalk_1.1.0.1 GenomicFeatures_1.36.4 backports_1.1.8
## [52] httpuv_1.5.2 Hmisc_4.4-0 tools_3.6.3
## [55] nor1mix_1.3-0 affyio_1.54.0 ellipsis_0.3.1
## [58] siggenes_1.58.0 Rcpp_1.0.4.6 plyr_1.8.6
## [61] base64enc_0.1-3 progress_1.2.2 zlibbioc_1.30.0
## [64] RCurl_1.98-1.2 prettyunits_1.1.1 rpart_4.1-15
## [67] openssl_1.4.2 cowplot_1.0.0 bumphunter_1.26.0
## [70] haven_2.2.0 cluster_2.1.0 fs_1.4.0
## [73] data.table_1.12.8 reprex_0.3.0 hms_0.5.3
## [76] mime_0.9 evaluate_0.14 xtable_1.8-4
## [79] XML_3.99-0.3 jpeg_0.1-8.1 mclust_5.4.6
## [82] readxl_1.3.1 compiler_3.6.3 minfi_1.30.0
## [85] KernSmooth_2.23-17 crayon_1.3.4 htmltools_0.4.0
## [88] later_1.0.0 Formula_1.2-3 geneplotter_1.62.0
## [91] lubridate_1.7.4 DBI_1.1.0 dbplyr_1.4.2
## [94] MASS_7.3-51.6 Matrix_1.2-18 cli_2.0.2
## [97] quadprog_1.5-8 pkgconfig_2.0.3 GenomicAlignments_1.20.1
## [100] foreign_0.8-76 xml2_1.2.5 foreach_1.5.0
## [103] annotate_1.62.0 rngtools_1.5 webshot_0.5.2
## [106] multtest_2.40.0 beanplot_1.2 XVector_0.24.0
## [109] rvest_0.3.5 doRNG_1.8.2 scrime_1.3.5
## [112] digest_0.6.25 Biostrings_2.52.0 rmarkdown_2.1
## [115] base64_2.0 cellranger_1.1.0 htmlTable_1.13.3
## [118] DelayedMatrixStats_1.6.1 curl_4.3 Rsamtools_2.0.3
## [121] shiny_1.4.0.2 lifecycle_0.2.0 jsonlite_1.7.0
## [124] Rhdf5lib_1.6.3 askpass_1.1 fansi_0.4.1
## [127] pillar_1.4.4 lattice_0.20-41 fastmap_1.0.1
## [130] httr_1.4.1 survival_3.1-12 GO.db_3.8.2
## [133] png_0.1-7 iterators_1.0.12 bit_1.1-15.2
## [136] stringi_1.4.6 HDF5Array_1.12.3 blob_1.2.1
## [139] latticeExtra_0.6-29 memoise_1.1.0